import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
%matplotlib inline
plt.style.use('seaborn')
pyo.init_notebook_mode()
df = pd.read_csv('2016_Building_Energy_Benchmarking.csv')
df.head()
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | Address | City | State | ZipCode | TaxParcelIdentificationNumber | ... | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | ComplianceStatus | Outlier | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2016 | NonResidential | Hotel | Mayflower park hotel | 405 Olive way | Seattle | WA | 98101.0 | 0659000030 | ... | 1.156514e+06 | 3946027.0 | 12764.52930 | 1276453.0 | False | NaN | Compliant | NaN | 249.98 | 2.83 |
| 1 | 2 | 2016 | NonResidential | Hotel | Paramount Hotel | 724 Pine street | Seattle | WA | 98101.0 | 0659000220 | ... | 9.504252e+05 | 3242851.0 | 51450.81641 | 5145082.0 | False | NaN | Compliant | NaN | 295.86 | 2.86 |
| 2 | 3 | 2016 | NonResidential | Hotel | 5673-The Westin Seattle | 1900 5th Avenue | Seattle | WA | 98101.0 | 0659000475 | ... | 1.451544e+07 | 49526664.0 | 14938.00000 | 1493800.0 | False | NaN | Compliant | NaN | 2089.28 | 2.19 |
| 3 | 5 | 2016 | NonResidential | Hotel | HOTEL MAX | 620 STEWART ST | Seattle | WA | 98101.0 | 0659000640 | ... | 8.115253e+05 | 2768924.0 | 18112.13086 | 1811213.0 | False | NaN | Compliant | NaN | 286.43 | 4.67 |
| 4 | 8 | 2016 | NonResidential | Hotel | WARWICK SEATTLE HOTEL (ID8) | 401 LENORA ST | Seattle | WA | 98121.0 | 0659000970 | ... | 1.573449e+06 | 5368607.0 | 88039.98438 | 8803998.0 | False | NaN | Compliant | NaN | 505.01 | 2.88 |
5 rows × 46 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3376 entries, 0 to 3375 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 3376 non-null int64 1 DataYear 3376 non-null int64 2 BuildingType 3376 non-null object 3 PrimaryPropertyType 3376 non-null object 4 PropertyName 3376 non-null object 5 Address 3376 non-null object 6 City 3376 non-null object 7 State 3376 non-null object 8 ZipCode 3360 non-null float64 9 TaxParcelIdentificationNumber 3376 non-null object 10 CouncilDistrictCode 3376 non-null int64 11 Neighborhood 3376 non-null object 12 Latitude 3376 non-null float64 13 Longitude 3376 non-null float64 14 YearBuilt 3376 non-null int64 15 NumberofBuildings 3368 non-null float64 16 NumberofFloors 3376 non-null int64 17 PropertyGFATotal 3376 non-null int64 18 PropertyGFAParking 3376 non-null int64 19 PropertyGFABuilding(s) 3376 non-null int64 20 ListOfAllPropertyUseTypes 3367 non-null object 21 LargestPropertyUseType 3356 non-null object 22 LargestPropertyUseTypeGFA 3356 non-null float64 23 SecondLargestPropertyUseType 1679 non-null object 24 SecondLargestPropertyUseTypeGFA 1679 non-null float64 25 ThirdLargestPropertyUseType 596 non-null object 26 ThirdLargestPropertyUseTypeGFA 596 non-null float64 27 YearsENERGYSTARCertified 119 non-null object 28 ENERGYSTARScore 2533 non-null float64 29 SiteEUI(kBtu/sf) 3369 non-null float64 30 SiteEUIWN(kBtu/sf) 3370 non-null float64 31 SourceEUI(kBtu/sf) 3367 non-null float64 32 SourceEUIWN(kBtu/sf) 3367 non-null float64 33 SiteEnergyUse(kBtu) 3371 non-null float64 34 SiteEnergyUseWN(kBtu) 3370 non-null float64 35 SteamUse(kBtu) 3367 non-null float64 36 Electricity(kWh) 3367 non-null float64 37 Electricity(kBtu) 3367 non-null float64 38 NaturalGas(therms) 3367 non-null float64 39 NaturalGas(kBtu) 3367 non-null float64 40 DefaultData 3376 non-null bool 41 Comments 0 non-null float64 42 ComplianceStatus 3376 non-null object 43 Outlier 32 non-null object 44 TotalGHGEmissions 3367 non-null float64 45 GHGEmissionsIntensity 3367 non-null float64 dtypes: bool(1), float64(22), int64(8), object(15) memory usage: 1.2+ MB
df.drop(['DataYear', 'City', 'State', 'ZipCode',
'TaxParcelIdentificationNumber', 'CouncilDistrictCode',
'Address', 'YearsENERGYSTARCertified', 'Comments',
'Electricity(kWh)', 'NaturalGas(therms)'],
axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3376 entries, 0 to 3375 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 3376 non-null int64 1 BuildingType 3376 non-null object 2 PrimaryPropertyType 3376 non-null object 3 PropertyName 3376 non-null object 4 Neighborhood 3376 non-null object 5 Latitude 3376 non-null float64 6 Longitude 3376 non-null float64 7 YearBuilt 3376 non-null int64 8 NumberofBuildings 3368 non-null float64 9 NumberofFloors 3376 non-null int64 10 PropertyGFATotal 3376 non-null int64 11 PropertyGFAParking 3376 non-null int64 12 PropertyGFABuilding(s) 3376 non-null int64 13 ListOfAllPropertyUseTypes 3367 non-null object 14 LargestPropertyUseType 3356 non-null object 15 LargestPropertyUseTypeGFA 3356 non-null float64 16 SecondLargestPropertyUseType 1679 non-null object 17 SecondLargestPropertyUseTypeGFA 1679 non-null float64 18 ThirdLargestPropertyUseType 596 non-null object 19 ThirdLargestPropertyUseTypeGFA 596 non-null float64 20 ENERGYSTARScore 2533 non-null float64 21 SiteEUI(kBtu/sf) 3369 non-null float64 22 SiteEUIWN(kBtu/sf) 3370 non-null float64 23 SourceEUI(kBtu/sf) 3367 non-null float64 24 SourceEUIWN(kBtu/sf) 3367 non-null float64 25 SiteEnergyUse(kBtu) 3371 non-null float64 26 SiteEnergyUseWN(kBtu) 3370 non-null float64 27 SteamUse(kBtu) 3367 non-null float64 28 Electricity(kBtu) 3367 non-null float64 29 NaturalGas(kBtu) 3367 non-null float64 30 DefaultData 3376 non-null bool 31 ComplianceStatus 3376 non-null object 32 Outlier 32 non-null object 33 TotalGHGEmissions 3367 non-null float64 34 GHGEmissionsIntensity 3367 non-null float64 dtypes: bool(1), float64(18), int64(6), object(10) memory usage: 900.2+ KB
building_dic = {'NonResidential': 'NonResidential',
'Nonresidential COS': 'NonResidential',
'Multifamily MR (5-9)': 'Residential',
'SPS-District K-12': 'School',
'Campus': 'Campus',
'Multifamily LR (1-4)': 'Residential',
'Multifamily HR (10+)': 'Residential',
'Nonresidential WA': 'NonResidential'}
df['BuildingType'] = df['BuildingType'].map(building_dic)
df = df[df['BuildingType'] != 'Residential'].copy()
df = df[df['ENERGYSTARScore'].notna()].copy()
plt.figure(figsize=(12,12), facecolor="#eaeaf2")
sns.histplot(data=df, y='PrimaryPropertyType', hue='LargestPropertyUseType',
multiple='stack');
plt.figure(figsize=(12,12), facecolor="#eaeaf2")
sns.boxplot(data=df, y='PrimaryPropertyType', x='SiteEUI(kBtu/sf)');
The building with "Parking" as largestpropertyuse will be change based on info find with GoogleMaps.
In addition, one hospital will be labelled as Senior Care Community and one Senior Care Community will be labelled as Hospital
df.loc[308, 'LargestPropertyUseType'] = "Office"
df.loc[309, 'LargestPropertyUseType'] = "Data Center"
df.loc[2800, 'LargestPropertyUseType'] = "Supermarket/Grocery Store"
df.loc[2095, 'LargestPropertyUseType'] = "Retail Store" #Automibili Italine
df.loc[246, 'LargestPropertyUseType'] = "Office" #metropolitan park
df.loc[488, 'LargestPropertyUseType'] = "Office" # West lake SEDO
df.loc[1306, 'LargestPropertyUseType'] = "Retail Store"# Elliot court
df.loc[1147, 'LargestPropertyUseType'] = "Hotel" #Palladian Hotel
df.loc[2414, 'LargestPropertyUseType'] = "Office" #Talon Northlake
df.loc[601, 'LargestPropertyUseType'] = "Supermarket/Grocery Store"
df.loc[1109, 'LargestPropertyUseType'] = 'Hospital (General Medical & Surgical)' #Ballard Center
df.loc[2482, "LargestPropertyUseType"] = "Senior Care Community"
plt.figure(figsize=(12,18), facecolor="#eaeaf2")
sns.boxplot(data=df, y='LargestPropertyUseType', x='SiteEUI(kBtu/sf)');
We can see on the boxplot that the LargestPropertyUseType will be an important column for the predictions
In addition to the change we've already made, we are going to group a few columns together
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Distribution Center', 'Warehouse')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Refrigerated Warehouse', 'Warehouse')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Non-Refrigerated Warehouse', 'Warehouse')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Wholesale Club/Supercenter', 'Retail Store')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Other', 'Retail Store')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Bank Branch', 'Bank/Finance/Courthouse')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Financial Office', 'Bank/Finance/Courthouse')
df['LargestPropertyUseType'] = df['LargestPropertyUseType'].replace('Courthouse', 'Bank/Finance/Courthouse')
df.LargestPropertyUseType.value_counts()
Office 416 Warehouse 191 K-12 School 125 Retail Store 83 Hotel 73 Worship Facility 65 Supermarket/Grocery Store 40 Medical Office 37 Residence Hall/Dormitory 19 Senior Care Community 18 Hospital (General Medical & Surgical) 10 Bank/Finance/Courthouse 9 Data Center 4 Multifamily Housing 4 Name: LargestPropertyUseType, dtype: int64
df = df[df['LargestPropertyUseType'] != 'Multifamily Housing']
drop_list = ['ListOfAllPropertyUseTypes',
'PrimaryPropertyType',
'LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseType',
'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseType',
'ThirdLargestPropertyUseTypeGFA']
df.drop(drop_list, axis=1, inplace=True)
plt.figure(figsize=(12,6), facecolor="#eaeaf2")
sns.boxplot(data=df, x='ComplianceStatus', y='SourceEUI(kBtu/sf)')
plt.ylim(0,500);
df.ComplianceStatus.value_counts()
Compliant 993 Error - Correct Default Data 88 Non-Compliant 9 Name: ComplianceStatus, dtype: int64
df = df[df['ComplianceStatus'] == 'Compliant'].copy()
df.Outlier.value_counts()
Series([], Name: Outlier, dtype: int64)
df.DefaultData.value_counts()
False 993 Name: DefaultData, dtype: int64
The "Non-compliant" building have a lower energy consumption. However we do not have enough info about the data to use them. We will only keep the data labelled as "Compliant"
Keeping only the "Compliant" building also suppressed the data labelled as outliers and the data where DefaultData were not False
df.drop(['ComplianceStatus', 'DefaultData', 'Outlier'],
axis=1, inplace=True)
df.Neighborhood.value_counts()
DOWNTOWN 243 GREATER DUWAMISH 200 LAKE UNION 101 MAGNOLIA / QUEEN ANNE 87 NORTHEAST 80 EAST 65 NORTHWEST 51 BALLARD 33 NORTH 31 CENTRAL 25 DELRIDGE 24 SOUTHEAST 21 SOUTHWEST 18 Northwest 4 Delridge 3 Ballard 3 North 3 Central 1 Name: Neighborhood, dtype: int64
df['Neighborhood'] = df['Neighborhood'].str.title()
df['Neighborhood'] = df['Neighborhood'].replace('Delridge.*', 'Delrigde', regex=True)
# all the name starting by "Delridge" are replaced by "Delridge", not necessary after the data selection
def plot_map(column):
fig = px.scatter_mapbox(df,
lat='Latitude',
lon='Longitude',
color=column,
hover_name='PropertyName')
fig.update_layout(mapbox_style='open-street-map')
title = 'map-' + column
pyo.iplot(fig, title)
plot_map('Neighborhood')
plot_map('SiteEUI(kBtu/sf)')
plot_map('GHGEmissionsIntensity')
df.drop(['Latitude', 'Longitude'],
axis=1, inplace=True)
plt.figure(figsize=(12,4), facecolor="#eaeaf2")
sns.countplot(data=df, x='NumberofBuildings', color='#7490c0');
la plupart des terrains n'ont qu'un batiment. Nous allons completer les valeurs manquantes et les zeros par cette valeur
df['NumberofBuildings'].fillna(value=1, inplace=True)
df['NumberofBuildings'].replace(0,1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 993 entries, 0 to 3347 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 993 non-null int64 1 BuildingType 993 non-null object 2 PropertyName 993 non-null object 3 Neighborhood 993 non-null object 4 YearBuilt 993 non-null int64 5 NumberofBuildings 993 non-null float64 6 NumberofFloors 993 non-null int64 7 PropertyGFATotal 993 non-null int64 8 PropertyGFAParking 993 non-null int64 9 PropertyGFABuilding(s) 993 non-null int64 10 LargestPropertyUseType 993 non-null object 11 ENERGYSTARScore 993 non-null float64 12 SiteEUI(kBtu/sf) 993 non-null float64 13 SiteEUIWN(kBtu/sf) 992 non-null float64 14 SourceEUI(kBtu/sf) 993 non-null float64 15 SourceEUIWN(kBtu/sf) 993 non-null float64 16 SiteEnergyUse(kBtu) 993 non-null float64 17 SiteEnergyUseWN(kBtu) 992 non-null float64 18 SteamUse(kBtu) 993 non-null float64 19 Electricity(kBtu) 993 non-null float64 20 NaturalGas(kBtu) 993 non-null float64 21 TotalGHGEmissions 993 non-null float64 22 GHGEmissionsIntensity 993 non-null float64 dtypes: float64(13), int64(6), object(4) memory usage: 186.2+ KB
fig, ax = plt.subplots(1,2,figsize=(12,4), facecolor="#eaeaf2")
sns.scatterplot(data=df, ax=ax[0], x='SiteEUI(kBtu/sf)', y='SiteEUIWN(kBtu/sf)')
sns.scatterplot(data=df, ax=ax[1], x='SourceEUI(kBtu/sf)', y='SourceEUIWN(kBtu/sf)');
df.drop(['SiteEUIWN(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUseWN(kBtu)'],
axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 993 entries, 0 to 3347 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 993 non-null int64 1 BuildingType 993 non-null object 2 PropertyName 993 non-null object 3 Neighborhood 993 non-null object 4 YearBuilt 993 non-null int64 5 NumberofBuildings 993 non-null float64 6 NumberofFloors 993 non-null int64 7 PropertyGFATotal 993 non-null int64 8 PropertyGFAParking 993 non-null int64 9 PropertyGFABuilding(s) 993 non-null int64 10 LargestPropertyUseType 993 non-null object 11 ENERGYSTARScore 993 non-null float64 12 SiteEUI(kBtu/sf) 993 non-null float64 13 SourceEUI(kBtu/sf) 993 non-null float64 14 SiteEnergyUse(kBtu) 993 non-null float64 15 SteamUse(kBtu) 993 non-null float64 16 Electricity(kBtu) 993 non-null float64 17 NaturalGas(kBtu) 993 non-null float64 18 TotalGHGEmissions 993 non-null float64 19 GHGEmissionsIntensity 993 non-null float64 dtypes: float64(10), int64(6), object(4) memory usage: 162.9+ KB
fig, ax = plt.subplots(1,2,figsize=(12,6))
sns.histplot(data=df, x='SiteEUI(kBtu/sf)', ax=ax[0])
sns.histplot(data=df, x='GHGEmissionsIntensity', ax=ax[1]);
df[df['GHGEmissionsIntensity'] > 15]
| OSEBuildingID | BuildingType | PropertyName | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35 | 43 | Campus | Plant 2 Site | Greater Duwamish | 1992 | 1.0 | 3 | 494835 | 0 | 494835 | Office | 8.0 | 221.699997 | 388.200012 | 448385312.0 | 0.0 | 150476283.0 | 297909000.0 | 16870.98 | 34.09 |
| 3264 | 49940 | NonResidential | Virginia Mason Medical Center - 2149 | East | 1920 | 1.0 | 8 | 374466 | 0 | 374466 | Hospital (General Medical & Surgical) | 74.0 | 162.300003 | 367.899994 | 163945984.0 | 73885472.0 | 90060497.0 | 0.0 | 6330.91 | 16.91 |
df[df['SiteEnergyUse(kBtu)'] <= 0]
| OSEBuildingID | BuildingType | PropertyName | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity |
|---|
df[df['TotalGHGEmissions'] <= 0]
| OSEBuildingID | BuildingType | PropertyName | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | ENERGYSTARScore | SiteEUI(kBtu/sf) | SourceEUI(kBtu/sf) | SiteEnergyUse(kBtu) | SteamUse(kBtu) | Electricity(kBtu) | NaturalGas(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 513 | 700 | NonResidential | IUC- Whole Foods Interbay | Magnolia / Queen Anne | 2008 | 1.0 | 1 | 57176 | 0 | 57176 | Supermarket/Grocery Store | 31.0 | 208.800003 | 0.0 | 1.252517e+07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 |
| 3206 | 49784 | NonResidential | Bullitt Center | Central | 2013 | 1.0 | 6 | 52000 | 0 | 52000 | Office | 100.0 | 7.100000 | 2.0 | 3.427261e+05 | 0.0 | -115417.0 | 0.0 | -0.8 | -0.02 |
The bullitt center is a low carbon emissions building. It was designed to reduce its carbon footprint. The solar panel on its top allows it to produce more electricity than it consumes.
We are going to remove these outliers
df = df[df['TotalGHGEmissions'] > 0].copy()
df['SteamRatio'] = df['SteamUse(kBtu)'] / df['SiteEnergyUse(kBtu)']
df['ElectricityRatio'] = df['Electricity(kBtu)'] / df['SiteEnergyUse(kBtu)']
df['NaturalGasRatio'] = df['NaturalGas(kBtu)'] / df['SiteEnergyUse(kBtu)']
df['SteamRatio10'] = (df['SteamRatio'] * 100 + 9) // 10
df['ElectricityRatio10'] = (df['ElectricityRatio'] * 100 + 9) // 10
df['NaturalGasRatio10'] = (df['NaturalGasRatio'] * 100 + 9) // 10
sns.boxplot(data=df, x='SteamRatio10', y='GHGEmissionsIntensity')
plt.ylim([-1,15])
plt.xlim([0.5,8]);
sns.boxplot(data=df, x='ElectricityRatio10', y='GHGEmissionsIntensity')
plt.ylim([-1,15])
plt.xlim([0.5,10.5]);
sns.boxplot(data=df, x='NaturalGasRatio10', y='GHGEmissionsIntensity')
plt.ylim([-1,15])
plt.xlim([0.5, 10.5]);
We can see that electicity is the cleanest energy source. GHG emissions increase with natural gas and steam use but decrease when electricity is the main source of energy
fig, ax = plt.subplots(1,2,figsize=(12,4))
sns.scatterplot(data=df, x='SiteEUI(kBtu/sf)', y='NumberofFloors', ax=ax[0])
sns.scatterplot(data=df, x='PropertyGFABuilding(s)', y='SiteEUI(kBtu/sf)', hue='NumberofBuildings', ax=ax[1]);
Larger building seems to require less energy
A building with multiple floors have less exposed surface area % compared to a one floor building. Threfore, it requires less energy to heat in winter
df['NumberofBuildings'].value_counts()[:2]
1.0 979 2.0 3 Name: NumberofBuildings, dtype: int64
df['NumberofFloors'].value_counts()[10:11]
0 10 Name: NumberofFloors, dtype: int64
There is ten building with 0 floors and many without building despite having an energy consumption.
We are going to replace these value by 1 to allow us to create the surface per floor and per building columns
df['NumberofBuildings'] = df['NumberofBuildings'].replace(0,1)
df['NumberofFloors'] = df['NumberofFloors'].replace(0,1)
df['SurfacePerBuilding'] = df['PropertyGFABuilding(s)'] / df['NumberofBuildings']
df['SurfacePerFloor'] = df['PropertyGFABuilding(s)'] / df['NumberofFloors']
df['ParkingRatio'] = df['PropertyGFAParking'] / df['PropertyGFATotal']
df['BuildingRatio'] = df['PropertyGFABuilding(s)'] / df['PropertyGFATotal']
corr = df.corr()[1:]
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True, cbar=None, fmt='.2f', cmap='winter_r');
plt.figure(figsize=(12,6), facecolor="#eaeaf2")
df['Decade'] = df['YearBuilt'] // 10
df['Decade'] = df['Decade'] * 10
sns.barplot(data=df, x='Decade', y='ENERGYSTARScore', color='#7490c0');
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='ENERGYSTARScore');
fig = px.scatter(df, x='ENERGYSTARScore', y='SiteEUI(kBtu/sf)', trendline='ols')
pyo.iplot(fig, 'scatter-energystar-site-eui')
fig = px.scatter(df, x='ENERGYSTARScore', y='GHGEmissionsIntensity', trendline='ols')
pyo.iplot(fig, 'scatter-plot-energystar-ghg')
It is difficult to see the correlation between the variables using a scatterplot. We are going to look at the data again using barplots and boxplots to confirm the trend showed by the regression lines
custom_palette = ["#ef3e22", "#f58220", '#feca0a', '#7fc241', "#008b4c"]
sns.palplot(custom_palette)
df['ESScoreGroups'] = (df['ENERGYSTARScore']+19) // 20 #creation de cinq groupes allant de 1 a 5
#les données sont décalées pour éviter la création d'un groupe 0
plt.figure(figsize=(12,6))
sns.barplot(data=df, x='ESScoreGroups', y='SiteEUI(kBtu/sf)', palette=custom_palette);
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='ESScoreGroups', y='SiteEUI(kBtu/sf)', palette=custom_palette)
plt.ylim([-2,400]);
plt.figure(figsize=(12,6))
sns.barplot(data=df, x='ESScoreGroups', y='GHGEmissionsIntensity', palette=custom_palette);
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='ESScoreGroups', y='GHGEmissionsIntensity', palette=custom_palette)
plt.ylim([-2, 10]);
We can easily see the correlation between the ENERGYSTAR score and the energy consumption
In the barplot, the average energie consumption is half for the group 5 building, those with the highest ENERGYSTAR scores, compared to the group with the lowest scores. The GHG emissions are 3 times higher for the low ENERGYSTAR scores buildings
final = df.drop([
'SourceEUI(kBtu/sf)',
'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)',
'SteamRatio10', 'ElectricityRatio10', 'NaturalGasRatio10',
'Decade', 'ESScoreGroups',
],
axis=1)
final.to_csv('seattle_clean2.csv', index=None)